Stock Market Prediction

In [1]:
import math,random
import quandl
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression,SGDRegressor,BayesianRidge,ARDRegression,PassiveAggressiveRegressor,TheilSenRegressor
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor,RandomForestRegressor,StackingRegressor,VotingRegressor
from sklearn.neural_network import MLPRegressor
import plotly.express as px
import warnings
warnings.filterwarnings('ignore')
In [2]:
# Ticker used by the single-stock walkthrough cells.
stock = "MSFT"
# Number of trailing rows held back from training and used as the forecast
# window (see PreprocessData).
# NOTE(review): 251*5 presumably means ~5 years of trading days -- confirm.
daysToForecast = 251*5
In [3]:
def getStockData(stock):
    """Download the price history for one ticker from Quandl's ``WIKI`` dataset.

    Parameters
    ----------
    stock : str
        Ticker symbol; it is appended to ``'WIKI/'`` to form the dataset code.

    Returns
    -------
    The object returned by ``quandl.get`` for that dataset code. The rest of
    this notebook indexes it with ``'Adj. Open'`` ... ``'Adj. Volume'`` columns.

    Notes
    -----
    The API key is read from the ``QUANDL_API_KEY`` environment variable when
    it is set and non-empty; otherwise the key that was previously hard-coded
    here is used so existing runs keep working.
    """
    import os  # local import: keeps this cell self-contained

    # FIXME(security): the literal fallback is a credential committed to the
    # notebook. Rotate it and drop the fallback once QUANDL_API_KEY is set
    # wherever this notebook runs.
    quandl.ApiConfig.api_key = os.environ.get("QUANDL_API_KEY") or "qWcicxSctVxrP9PhyneG"
    allData = quandl.get('WIKI/'+stock)
    return allData
In [4]:
def FormatDataForModel(dataArray):
    """Build the feature table used by the models from raw adjusted prices.

    Parameters
    ----------
    dataArray : pandas.DataFrame
        Must contain the columns ``'Adj. Open'``, ``'Adj. High'``,
        ``'Adj. Low'``, ``'Adj. Close'`` and ``'Adj. Volume'``. It is not
        modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns ``'Adj. Close'``, ``'HL_PCT'``,
        ``'PCT_change'`` and ``'Adj. Volume'`` (in that order), where

        * ``HL_PCT``     = (High - Close) / Close * 100
        * ``PCT_change`` = (Close - Open) / Open * 100

        and every missing value replaced by the sentinel ``-99999``.
    """
    # Take an explicit copy before adding columns. Assigning into a bare
    # column selection is the chained-assignment pattern that pandas reports
    # as SettingWithCopyWarning.
    prices = dataArray[['Adj. Open', 'Adj. High', 'Adj. Low', 'Adj. Close', 'Adj. Volume']].copy()
    prices['HL_PCT'] = (prices['Adj. High'] - prices['Adj. Close']) / prices['Adj. Close'] * 100.0
    prices['PCT_change'] = (prices['Adj. Close'] - prices['Adj. Open']) / prices['Adj. Open'] * 100.0
    features = prices[['Adj. Close', 'HL_PCT', 'PCT_change','Adj. Volume']]
    # Non-inplace fillna returns an independent frame, so callers can add
    # columns to the result without touching `prices` or the input.
    return features.fillna(-99999)
In [5]:
def PreprocessData(mlData,daysToForecast):
    """Create labels, scale the features and split off the forecast tail.

    Parameters
    ----------
    mlData : pandas.DataFrame
        Feature table containing an ``'Adj. Close'`` column. A ``'label'``
        column is added to it in place.
    daysToForecast : int
        Number of trailing rows held back from training. The label horizon is
        ``ceil(0.12 * daysToForecast)`` rows.

    Returns
    -------
    list
        ``[X, y, X_data, forecastData]`` where

        * ``X``            -- scaled features of every row except the last
          ``daysToForecast``;
        * ``y``            -- the ``'label'`` values of those same rows;
        * ``X_data``       -- scaled features of the last ``daysToForecast``
          rows;
        * ``forecastData`` -- the last ``daysToForecast`` rows of ``mlData``
          (including the ``'label'`` column).
    """
    forecast_col = 'Adj. Close'
    forecast_out = int(math.ceil(0.12*daysToForecast))
    # Label for a row is the close `forecast_out` rows later.
    mlData['label'] = mlData[forecast_col].shift(-forecast_out)
    # No dropna is needed: the shift leaves NaN labels only in the last
    # `forecast_out` rows, and forecast_out <= daysToForecast for any
    # daysToForecast >= 1, so those rows fall in the held-back tail and
    # never reach `y`.
    #
    # Name the axis through `columns=`: passing it positionally
    # (`drop(['label'], 1)`) is rejected by pandas >= 2.0.
    X = np.array(mlData.drop(columns=['label']))
    # NOTE(review): scaling uses statistics of all rows, including the
    # held-back tail, before the split below.
    X = preprocessing.scale(X)
    X_data = X[-daysToForecast:]
    X = X[:-daysToForecast]
    forecastData = mlData[-daysToForecast:]
    trainData = mlData[:-daysToForecast]
    y = np.array(trainData['label'])
    response = [X,y,X_data,forecastData]
    return response
In [6]:
def TrainAndPredict(model,X,y,X_data,test_size=0.2,random_state=None):
    """Fit ``model`` on a random split of ``(X, y)`` and predict ``X_data``.

    Parameters
    ----------
    model
        Estimator exposing ``fit``, ``score`` and ``predict``.
    X, y
        Training features and labels; split with ``train_test_split``.
    X_data
        Features to predict after fitting.
    test_size : float, optional
        Fraction of ``(X, y)`` held out for scoring. Defaults to ``0.2``,
        the value that was previously hard-coded.
    random_state : optional
        Forwarded to ``train_test_split``. The default ``None`` keeps the
        original behaviour (a different split on every call); pass an int
        to make the split, and therefore the score, reproducible.

    Returns
    -------
    tuple
        ``(accuracy, prediction)``: ``model.score`` on the held-out part and
        ``model.predict(X_data)``. For scikit-learn regressors ``score`` is
        the R^2 coefficient, not a classification accuracy.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)
    model.fit(X_train, y_train)
    accuracy = model.score(X_test, y_test)
    prediction = model.predict(X_data)
    return accuracy, prediction
In [7]:
def addPredictionToForecast(prediction,forecastData):
    """Pair the actual closes of the forecast rows with the model output.

    Keeps only ``'Adj. Close'`` from ``forecastData``, renames it to
    ``'EOD'`` and adds ``prediction`` as a ``'prediction'`` column on the
    same rows (no date shift is applied). Returns the new two-column frame;
    ``forecastData`` itself is left unchanged.
    """
    eod_only = forecastData[['Adj. Close']].rename(columns={'Adj. Close':'EOD'})
    return eod_only.assign(prediction=prediction[:])
In [8]:
def GraphPredictions(forecastData,stock):
    """Show a plotly line chart of every column in ``forecastData``.

    The chart is titled with ``stock`` and its axes are labelled
    'Time' and 'Price'.
    """
    layout = dict(title=stock, xaxis_title='Time', yaxis_title='Price')
    chart = px.line(forecastData)
    chart.update_layout(**layout)
    chart.show()
In [9]:
def GraphAllData(allData,forecastData,stock):
    """Show the full ``'Adj. Close'`` history with the predictions overlaid.

    ``allData['Adj. Close']`` and ``forecastData['prediction']`` are joined
    column-wise on their index and drawn as one plotly line chart titled
    with ``stock``.
    """
    columns = [allData['Adj. Close'], forecastData['prediction']]
    combined = pd.concat(columns, axis=1, sort=False)
    layout = dict(title=stock, xaxis_title='Time', yaxis_title='Price')
    chart = px.line(combined)
    chart.update_layout(**layout)
    chart.show()
In [10]:
# End-to-end run for the single ticker in `stock`: download the history,
# build the feature table, hold back the last `daysToForecast` rows, fit a
# linear regression on the rest, and attach its predictions to the held-back
# rows.
allData = getStockData(stock)
mlData = FormatDataForModel(allData)
X,y,X_data,forecastData = PreprocessData(mlData,daysToForecast)
model = LinearRegression()
accuracy,prediction=TrainAndPredict(model,X,y,X_data)
# forecastData is rebound here to the two-column frame (EOD, prediction).
forecastData = addPredictionToForecast(prediction,forecastData)
In [11]:
# Value of model.score on the random hold-out made inside TrainAndPredict
# (R^2 for scikit-learn regressors, so it is not a classification accuracy).
print(accuracy)
0.8822335071477906
In [12]:
# Actual close (EOD) against the model's prediction, held-back rows only.
GraphPredictions(forecastData,stock)
In [13]:
# Full adjusted-close history with the prediction column overlaid.
GraphAllData(allData,forecastData,stock)
In [14]:
# Tickers for the multi-stock comparison; getStockData fetches each one as
# 'WIKI/<ticker>'.
stock_list = ['AAPL', 'IBM', 'MSFT', 'WMT','AMZN','TSLA','PLUG','GOOGL','FB','CRM']
In [15]:
# Repeat the single-stock linear-regression pipeline for every ticker in
# stock_list, printing the hold-out score and drawing both charts per stock.
# NOTE: the loop variable rebinds the notebook-level `stock` (and allData,
# mlData, forecastData, ...), so after this cell they refer to the last
# ticker in stock_list rather than the one set in the config cell.
for stock in stock_list:
    print("Stock: ", stock)
    allData = getStockData(stock)
    mlData = FormatDataForModel(allData)
    X,y,X_data,forecastData = PreprocessData(mlData,daysToForecast)
    # A fresh estimator per stock.
    model = LinearRegression()
    accuracy,prediction=TrainAndPredict(model,X,y,X_data)
    print("Accuracy: ", accuracy)
    forecastData = addPredictionToForecast(prediction,forecastData)
    GraphPredictions(forecastData,stock)
    GraphAllData(allData,forecastData,stock)
Stock:  AAPL
Accuracy:  0.8946944997630841
Stock:  IBM
Accuracy:  0.9557601419922813
Stock:  MSFT
Accuracy:  0.8838496738667084
Stock:  WMT
Accuracy:  0.9721587226933674
Stock:  AMZN
Accuracy:  0.8936439507961701
Stock:  TSLA
Accuracy:  0.40563632968374486
Stock:  PLUG
Accuracy:  0.43066308605824877
Stock:  GOOGL
Accuracy:  0.484359525211401
Stock:  FB
Accuracy:  0.21835497535703485
Stock:  CRM
Accuracy:  0.8682241845053769
In [16]:
# Candidate regressors, each constructed with default arguments and paired
# with the display name used in the printed log and as a row label of the
# results table.
model_list = [[LinearRegression(), "LinearRegression"],
              [SVR(),"SupportVectorRegression"],
              [MLPRegressor(),"MLPRegressor"],
              [SGDRegressor(),"SGDRegressor"],
              [BayesianRidge(),"BayesianRidge"],
              [ARDRegression(),"ARDRegression"],
              [PassiveAggressiveRegressor(),"PassiveAggressiveRegressor"],
              [TheilSenRegressor(),"TheilSenRegressor"]]
In [17]:
# Evaluate every model in model_list on every ticker in stock_list.
# model_results collects (model name, ticker, score) tuples;
# stock_dfs collects (ticker, frame) pairs, where each frame holds the actual
# close of the held-back rows plus one prediction column per model.
model_results = []
stock_dfs = []
for stock in stock_list:
    print("Stock: ", stock)
    allData = getStockData(stock)
    mlData = FormatDataForModel(allData)
    X,y,X_data,forecastData = PreprocessData(mlData,daysToForecast)
    # Start the per-stock frame from the actual closes, labelled '<ticker> Actual'.
    df_stocks = forecastData[['Adj. Close']]
    df_stocks = df_stocks.rename(columns={'Adj. Close':stock+' Actual'})
    # The same estimator instances from model_list are re-fit for each stock,
    # and every TrainAndPredict call makes its own train/test split.
    for model,name in model_list:
        accuracy,prediction=TrainAndPredict(model,X,y,X_data)
        print("Model: ",name , "  ","Accuracy:", accuracy)
        model_results.append((name,stock,accuracy))
        df_stocks[name] = prediction[:]
    stock_dfs.append((stock,df_stocks))
Stock:  AAPL
Model:  LinearRegression    Accuracy: 0.9148437640287511
Model:  SupportVectorRegression    Accuracy: 0.9372633870206728
Model:  MLPRegressor    Accuracy: 0.9595754417826816
Model:  SGDRegressor    Accuracy: 0.9091097315130313
Model:  BayesianRidge    Accuracy: 0.9161172826455627
Model:  ARDRegression    Accuracy: 0.8978638836444277
Model:  PassiveAggressiveRegressor    Accuracy: 0.8173886925622273
Model:  TheilSenRegressor    Accuracy: 0.8883585760505722
Stock:  IBM
Model:  LinearRegression    Accuracy: 0.9595087705522414
Model:  SupportVectorRegression    Accuracy: 0.9453130795256263
Model:  MLPRegressor    Accuracy: 0.9650008995477554
Model:  SGDRegressor    Accuracy: 0.9603130559505415
Model:  BayesianRidge    Accuracy: 0.9565918504624881
Model:  ARDRegression    Accuracy: 0.9568594311940128
Model:  PassiveAggressiveRegressor    Accuracy: 0.9535221492646492
Model:  TheilSenRegressor    Accuracy: 0.9603805052811474
Stock:  MSFT
Model:  LinearRegression    Accuracy: 0.894271640844448
Model:  SupportVectorRegression    Accuracy: 0.8959939451987312
Model:  MLPRegressor    Accuracy: 0.9174629337782083
Model:  SGDRegressor    Accuracy: 0.8905906788267222
Model:  BayesianRidge    Accuracy: 0.8933265675806625
Model:  ARDRegression    Accuracy: 0.8893085475991143
Model:  PassiveAggressiveRegressor    Accuracy: 0.8172830604999448
Model:  TheilSenRegressor    Accuracy: 0.8861127001827853
Stock:  WMT
Model:  LinearRegression    Accuracy: 0.971088281411419
Model:  SupportVectorRegression    Accuracy: 0.9624799281380191
Model:  MLPRegressor    Accuracy: 0.9775282373861746
Model:  SGDRegressor    Accuracy: 0.9719939725524847
Model:  BayesianRidge    Accuracy: 0.9724562036085467
Model:  ARDRegression    Accuracy: 0.9693612603283626
Model:  PassiveAggressiveRegressor    Accuracy: 0.9632657216915009
Model:  TheilSenRegressor    Accuracy: 0.9715809137865831
Stock:  AMZN
Model:  LinearRegression    Accuracy: 0.8978331427353866
Model:  SupportVectorRegression    Accuracy: 0.5764659977852034
Model:  MLPRegressor    Accuracy: 0.9006354459925908
Model:  SGDRegressor    Accuracy: 0.8980926370956147
Model:  BayesianRidge    Accuracy: 0.8966390980163315
Model:  ARDRegression    Accuracy: 0.9009321281852483
Model:  PassiveAggressiveRegressor    Accuracy: 0.9011054675722021
Model:  TheilSenRegressor    Accuracy: 0.8907403303390521
Stock:  TSLA
Model:  LinearRegression    Accuracy: 0.41344558458942793
Model:  SupportVectorRegression    Accuracy: -0.16027872131019238
Model:  MLPRegressor    Accuracy: 0.028530318229792573
Model:  SGDRegressor    Accuracy: 0.002314821134315248
Model:  BayesianRidge    Accuracy: 0.4112411484927925
Model:  ARDRegression    Accuracy: 0.3522147669150808
Model:  PassiveAggressiveRegressor    Accuracy: -0.1837509428185118
Model:  TheilSenRegressor    Accuracy: 0.20336168746350913
Stock:  PLUG
Model:  LinearRegression    Accuracy: 0.49504973777914374
Model:  SupportVectorRegression    Accuracy: 0.3436113334495654
Model:  MLPRegressor    Accuracy: 0.5896577807330884
Model:  SGDRegressor    Accuracy: 0.45195079939096316
Model:  BayesianRidge    Accuracy: 0.5476024313769124
Model:  ARDRegression    Accuracy: 0.4726240062287961
Model:  PassiveAggressiveRegressor    Accuracy: 0.4681074956375638
Model:  TheilSenRegressor    Accuracy: 0.007915610336532541
Stock:  GOOGL
Model:  LinearRegression    Accuracy: 0.552491969458385
Model:  SupportVectorRegression    Accuracy: 0.37453020683675964
Model:  MLPRegressor    Accuracy: -0.4168116334253531
Model:  SGDRegressor    Accuracy: 0.5355866584576547
Model:  BayesianRidge    Accuracy: 0.5163115706693089
Model:  ARDRegression    Accuracy: 0.5447517896440515
Model:  PassiveAggressiveRegressor    Accuracy: 0.5646340614067287
Model:  TheilSenRegressor    Accuracy: 0.5537525267569205
Stock:  FB
Model:  LinearRegression    Accuracy: 0.20596868581381678
Model:  SupportVectorRegression    Accuracy: -0.20636375670312335
Model:  MLPRegressor    Accuracy: -4.998608303428433
Model:  SGDRegressor    Accuracy: -0.18729479211272815
Model:  BayesianRidge    Accuracy: 0.20870933868709707
Model:  ARDRegression    Accuracy: 0.11010624784490108
Model:  PassiveAggressiveRegressor    Accuracy: 0.11755396857802236
Model:  TheilSenRegressor    Accuracy: 0.28449943982111303
Stock:  CRM
Model:  LinearRegression    Accuracy: 0.8580051845632015
Model:  SupportVectorRegression    Accuracy: 0.825772135737159
Model:  MLPRegressor    Accuracy: 0.8427955656877734
Model:  SGDRegressor    Accuracy: 0.8594314073477969
Model:  BayesianRidge    Accuracy: 0.8692913513851444
Model:  ARDRegression    Accuracy: 0.8577609113608977
Model:  PassiveAggressiveRegressor    Accuracy: 0.7260537904672781
Model:  TheilSenRegressor    Accuracy: 0.8570739949442255
In [18]:
# Results table: one row per model, one column per stock, filled from the
# (model name, ticker, score) tuples in model_results.
model_names = [name for _model, name in model_list]
# dtype=float makes the table numeric from the start; without it the empty
# frame is object-typed and downstream reductions need an explicit cast.
df = pd.DataFrame(columns=stock_list,index=model_names,dtype=float)
for name, ticker, score in model_results:
    df.at[name, ticker] = score
df
Out[18]:
AAPL IBM MSFT WMT AMZN TSLA PLUG GOOGL FB CRM
LinearRegression 0.914844 0.959509 0.894272 0.971088 0.897833 0.413446 0.49505 0.552492 0.205969 0.858005
SupportVectorRegression 0.937263 0.945313 0.895994 0.96248 0.576466 -0.160279 0.343611 0.37453 -0.206364 0.825772
MLPRegressor 0.959575 0.965001 0.917463 0.977528 0.900635 0.0285303 0.589658 -0.416812 -4.99861 0.842796
SGDRegressor 0.90911 0.960313 0.890591 0.971994 0.898093 0.00231482 0.451951 0.535587 -0.187295 0.859431
BayesianRidge 0.916117 0.956592 0.893327 0.972456 0.896639 0.411241 0.547602 0.516312 0.208709 0.869291
ARDRegression 0.897864 0.956859 0.889309 0.969361 0.900932 0.352215 0.472624 0.544752 0.110106 0.857761
PassiveAggressiveRegressor 0.817389 0.953522 0.817283 0.963266 0.901105 -0.183751 0.468107 0.564634 0.117554 0.726054
TheilSenRegressor 0.888359 0.960381 0.886113 0.971581 0.89074 0.203362 0.00791561 0.553753 0.284499 0.857074
In [19]:
# For every stock column, record which model scored highest and that score.
highest = []
for ticker in df.columns:
    # Cast once so idxmax and max both operate on the same float values,
    # whatever dtype the table's columns hold.
    scores = df[ticker].astype(float)
    highest.append([ticker, scores.idxmax(), scores.max()])
df_high = pd.DataFrame(highest, columns=["Stock","Model","Accuracy"])
df_high
Out[19]:
Stock Model Accuracy
0 AAPL MLPRegressor 0.959575
1 IBM MLPRegressor 0.965001
2 MSFT MLPRegressor 0.917463
3 WMT MLPRegressor 0.977528
4 AMZN PassiveAggressiveRegressor 0.901105
5 TSLA LinearRegression 0.413446
6 PLUG MLPRegressor 0.589658
7 GOOGL PassiveAggressiveRegressor 0.564634
8 FB TheilSenRegressor 0.284499
9 CRM BayesianRidge 0.869291
In [20]:
# Mean score per model across all stocks (row-wise mean of the results table).
# NOTE(review): the scores can be strongly negative (see the recorded FB
# results), so one bad fit can dominate a model's mean.
average = df.mean(axis=1)
In [21]:
# Rank the models from highest to lowest mean score.
average.sort_values(ascending=False)
Out[21]:
BayesianRidge                 0.718829
LinearRegression              0.716251
ARDRegression                 0.695178
TheilSenRegressor             0.650378
SGDRegressor                  0.629209
PassiveAggressiveRegressor    0.614516
SupportVectorRegression       0.549479
MLPRegressor                  0.076577
dtype: float64
In [22]:
# One chart per stock: the actual close and every model's prediction over the
# held-back rows. GraphPredictions draws exactly this chart (px.line of the
# frame, titled with the ticker, 'Time'/'Price' axes), so reuse it instead of
# repeating its body.
for stock,stock_df in stock_dfs:
    GraphPredictions(stock_df,stock)